package com.niit.spark.sql.test

import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Date: 2025/5/14
 * Author: Ys
 * Description: Demonstrates removing duplicate rows from a DataFrame,
 * both across all columns and by a chosen key column.
 */
object RemoveDuplicates {

  /**
   * Entry point: reads a sales CSV and demonstrates two de-duplication
   * strategies — full-row `distinct` (after dropping the id column) and
   * key-based `dropDuplicates` on `product_id`.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("RemoveDuplicates")
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    import spark.implicits._

    val salesDf: DataFrame = spark.read
      .option("header", "true")
      .csv("input/sql/sales1.csv")

    // Drop the surrogate id column first so that rows identical in every
    // remaining column collapse to one.
    salesDf.drop("id").distinct().show()

    // De-duplicate by the specified key column: keeps one (arbitrary) row
    // per distinct product_id value.
    salesDf.dropDuplicates("product_id").show()

    spark.stop()
  }

}
